import requests
from re import findall
from bs4 import BeautifulSoup
import html2text

# Third-party dependencies: requests, beautifulsoup4 (bs4), html2text

#获取微信公众号内容,保存标题和时间
def get_weixin_md(url):
    res=requests.get(url)
    soup=BeautifulSoup(res.text,"html.parser")

    weixin_title=soup.find('h1').string.strip()
    #result=findall(r'[0-9]{4}-[0-9]{2}-[0-9]{2}.+:[0-9]{2}',res.text)
    result=findall(r"(\d{4}-\d{1,2}-\d{1,2})",res.text)
    weixin_time=result[0]

    #获取正文html并修改
    content=soup.find(id='js_content')
    soup2=BeautifulSoup((str(content)),"html.parser")
    soup2.div['style']='visibility: visible;'
    html=str(soup2)
    pattern=r'http[s]?:\/\/[a-z.A-Z_0-9\/\?=-_-]+'
    result = findall(pattern, html)
    
    #将data-src修改为src
    for url in result:
        html=html.replace('data-src="'+url+'"','src="'+url+'"')
    
    return {'title':weixin_title, 'time':weixin_time, 'md': html2text.html2text(html) }


# url = 'https://mp.weixin.qq.com/s?src=11&timestamp=1669000493&ver=4179&signature=b1tUK9kzrJ*O6NX2jMFRsXsNdQEJ-W80eLDTPtZDnQuGQP*ufiQtRAxCA8omIwGWGHQgzsNfMfq4LlK*db2A7UK0godDCqSP2UXuvSg7dAN0euBgZOtVCm4l3w29is5E&new=1'
# markdown = get_weixin_md(url)
# print(markdown)

# with open('test3.md', 'w', encoding='utf-8') as file:
#     file.write(markdown['md'])